Data link: https://knights-lab.github.io/MLRepo/docs/hmp.html
Sequencing method: 16S rRNA 454 pyrosequencing
Sample description: data collected from 18 body sites across 242 healthy subjects
alpha and beta diversity based on qiime otu_table
PCoA
Machine learning task to classify body sites based on meta and otu_table
shap model prediction
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.svm import SVC
import skbio
import scipy
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
%matplotlib inline
# Load sample metadata and keep only trustworthy rows.
meta = pd.read_csv('hmp/mapping-orig.txt', sep='\t')
# Remove samples flagged as mislabeled or contaminated in one pass
# (identical to filtering the two flags sequentially).
clean_mask = (meta['MISLABELED'] != True) & (meta['CONTAMINATED'] != True)
meta = meta[clean_mask]
meta.head()
#"HMPBODYSUPERSITE" | HMPBODYSUBSITE
#HOST_SUBJECT_ID: patient id
TARGET_COLUMN = "HMPBODYSUBSITE"
# Metadata columns dropped before modelling: constants, free text,
# identifiers, and target leakage (e.g. BODY_SITE, HMPBODYSUPERSITE).
# NOTE: the original list contained "MIENS_COMPLIANT" twice; deduplicated.
DEL_COLUMNS = ["BarcodeSequence","LinkerPrimerSequence","TARGET_SUBFRAGMENT","RUN_PREFIX","TITLE", "AGE_IN_YEARS",
"ASSIGNED_FROM_GEO","INVESTIGATION_TYPE","HOST_COMMON_NAME","DEPTH","HOST_TAXID","STUDY_CENTER",
"SUBMIT_TO_INSDC","COMMON_NAME","INCLUDES_TIMESERIES","LONGITUDE","SAMP_SIZE","EXPERIMENT_CENTER",
"STUDY_ABSTRACT","BODY_SITE","PROJECT_NAME","ELEVATION","RUN_DATE","HIV","ENV_FEATURE","SAMPLE_CENTER",
"COLLECTION_DATE","ALTITUDE","SEX","ENV_BIOME","COUNTRY","STUDY_TITLE","BODY_HABITAT","STUDY_ID",
"ANONYMIZED_NAME","TAXON_ID","REGION","SEQUENCING_METH","SRS","PLATFORM","VISITNO",
"STUDY_DESCRIPTION","SPECIFIC_BODY_SITE","MISLABELED","AGE_UNIT","MIENS_COMPLIANT",
"EXPERIMENT_DESIGN_DESCRIPTION","Description_duplicate","ABX_PAST_6_MOS","ENV_MATTER","TARGET_GENE",
"KEY_SEQ","BODY_PRODUCT","PSN","MYOCARDINFARCT","ATHEROSCLEROSIS","SMOKER","STUDY_ALIAS","PCR_PRIMERS",
"LIBRARY_CONSTRUCTION_PROTOCOL","LATITUDE","HYPERTENSION","Description","CONTAMINATED","SAMP_COLLECT_DEVICE",
"HMPBODYSUPERSITE"
]
# errors='ignore' skips columns that are absent, replacing the original
# per-column try/except loop with a single vectorised drop.
meta = meta.drop(columns=DEL_COLUMNS, errors='ignore')
meta.head()
meta[TARGET_COLUMN].value_counts()
### STR => numeric
### Process None data
# Use .loc / .replace throughout: the original chained assignments
# (meta[col][mask] = value) are the SettingWithCopy anti-pattern and are
# only silenced here by the global warnings filter.
for col in ["OBESITY", "CHRONIC_CONDITION"]:
    ## highly skewed, more n than y, hence assume None as n
    meta[col] = meta[col].replace({'y': 1, 'n': 0, 'None': 0}).astype('int')
for col in ["COLLECTDAY", "GENDER"]:
    # 'None' placeholders become 0 before the numeric cast.
    meta.loc[meta[col] == 'None', col] = 0
    meta[col] = meta[col].astype('int')
for col in ["RUN_CENTER", "HOST_SUBJECT_ID", "HMPBODYSUBSITE"]:
    # Label-encode categorical strings. NOTE: `Dict` deliberately leaks
    # out of this loop — later plotting cells reuse the final iteration's
    # (HMPBODYSUBSITE) mapping for tick labels.
    unique = meta[col].unique()
    Dict = dict((unique[i], i) for i in range(len(unique)))
    meta[col] = meta[col].map(Dict).astype('int')
for col in ["BODY_MASS_INDEX", "AGE", "INTPH", "PFPH"]:
    # Mean-impute 'None' placeholders in continuous columns.
    M = meta.loc[meta[col] != "None", col].astype('float').mean()
    meta.loc[meta[col] == "None", col] = M
    meta[col] = meta[col].astype('float')
meta.head()
def custom_transpose(df, drop_column="#OTU ID"):
    """Transpose an OTU table so samples become rows.

    After transposing, the row that came from the `drop_column` column
    supplies the new column labels and is removed; the remaining counts
    are cast to int.
    """
    transposed = pd.DataFrame(df.T)
    transposed.columns = transposed.iloc[0]
    body = transposed.drop([drop_column])
    return body.astype('int')
# Load the OTU/taxa count table and orient it samples-by-taxa, with a
# SampleID column for merging against the metadata.
tax = custom_transpose(pd.read_csv('hmp/gg/taxatable.txt', sep='\t'))
tax = (tax.astype('int')
          .reset_index()
          .rename(columns={"index": "SampleID"}))
tax.head()
def alpha_div(otu_tab, metric='shannon'):
    '''Compute per-sample alpha diversity from an OTU count table.

    Parameters
    ----------
    otu_tab : pd.DataFrame
        Samples x OTUs count table; a 'SampleID' column, if present, is
        dropped before computing.
    metric : str
        'shannon' — base-2 Shannon entropy of the row proportions, or
        'simpson' — 1 - sum(n*(n-1)) / (N*(N-1)).

    Returns
    -------
    pd.Series with one diversity value per row; undefined values
    (e.g. Simpson with a row total <= 1) are set to 0.

    Raises
    ------
    ValueError
        For an unrecognised metric (the original code raised an obscure
        NameError on the unbound `div` instead).
    '''
    if 'SampleID' in otu_tab.columns:
        otu_tab = otu_tab.drop(['SampleID'], axis=1)
    otu_tab = otu_tab.astype(int)
    rowsums = otu_tab.sum(axis=1)
    if metric == 'shannon':
        otu_p = otu_tab.divide(rowsums, axis=0)
        # Replace zero proportions with 1 before the log so 0*log2(0)
        # contributes exactly 0, instead of producing NaN and relying on
        # sum() silently skipping it.
        L = otu_p * np.log2(otu_p.where(otu_p > 0, 1))
        div = (-1) * L.sum(axis=1)
    elif metric == 'simpson':
        div = 1 - ((otu_tab * (otu_tab - 1)).sum(axis=1)) / (rowsums * (rowsums - 1))
    else:
        raise ValueError("unknown metric: {}".format(metric))
    div[div.isna()] = 0
    return div
#shannon = alpha_div(tax, metric='shannon')
#simpson = alpha_div(tax, metric='simpson')
from skbio.diversity import alpha_diversity, beta_diversity
from skbio import DistanceMatrix
from skbio.tree import nj
# NOTE(review): DistanceMetric moved to sklearn.metrics in newer
# scikit-learn releases — confirm the pinned version supports this path.
from sklearn.neighbors import DistanceMetric
# Keep only OTU-table samples that also have metadata.
tax = tax[tax['SampleID'].isin(meta['SampleID'])]
# Transpose to OTUs-as-rows (iloc[1:] skips the SampleID row), so the
# pairwise distances below are between OTUs, not samples.
tax_t = tax.T.iloc[1:,:]
dist = DistanceMetric.get_metric('braycurtis')
dm = dist.pairwise(tax_t.values.astype(int))
## index OTU_id
otu_ids = ["OTU{}".format(i) for i in range(1,len(tax_t.index)+1)]
## Bray-Curtis distance matrix => neighbor joining
# Build a neighbor-joining tree over the OTUs, rooted at midpoint; used
# below by tree-based diversity metrics (faith_pd).
DM = DistanceMatrix(dm, otu_ids)
tree = nj(DM).root_at_midpoint()
# All alpha-diversity metrics to compute via skbio; chao1_ci/esty_ci/osd
# return tuples and are expanded into scalar columns further down.
alpha_metric = ['berger_parker_d', 'brillouin_d', 'chao1', 'chao1_ci', 'dominance', 'doubles', 'enspie', 'esty_ci', 'faith_pd', 'fisher_alpha', 'gini_index', 'goods_coverage', 'heip_e', 'lladser_pe', 'margalef', 'mcintosh_d', 'mcintosh_e', 'menhinick', 'observed_otus', 'osd', 'pielou_e', 'robbins', 'shannon', 'simpson', 'simpson_e', 'singles', 'strong']
alpha_df = pd.DataFrame(columns=alpha_metric)
for metric in alpha_metric:
    try:
        '''Non-tree based alpha diversity'''
        x = alpha_diversity(metric, tax.drop(['SampleID'],axis=1).values, ids=tax.index)
    except Exception:
        '''Tree based alpha diversity (faith_pd)'''
        # NOTE(review): this broad except also masks genuine failures of
        # non-tree metrics; checking metric == 'faith_pd' explicitly
        # would be safer.
        x = alpha_diversity(metric, tax.drop(['SampleID'],axis=1).values, ids=tax.index, otu_ids=otu_ids, tree=tree)
    alpha_df[metric] = x
# Both frames share tax's index, so this assignment aligns row-wise.
alpha_df['SampleID'] = tax['SampleID']
meta_alpha = pd.merge(meta, alpha_df, on='SampleID')
corr = meta_alpha.corr(method='pearson')
def draw_corrplot(corr, figsize=(80,78), cmap="RdBu", annot=True, cbar=False, title="Pearson correlation Matrix",
                  ylim=(34,1), xlim=(0,33),
                  ):
    """Draw a lower-triangle heatmap of a correlation matrix.

    Parameters mirror seaborn.heatmap; ylim/xlim clip the axes so the
    masked upper triangle does not leave empty space around the plot.
    """
    corr = corr.round(2)
    # Mask the upper triangle. `np.bool` was removed in NumPy 1.24;
    # the builtin `bool` is the correct dtype here.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize=figsize)
    sns.set(font_scale=4)
    ax = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=annot,
                     square=True, linewidths=.5, cbar=cbar)
    ax.set_ylim(ylim)
    ax.set_xlim(xlim)
    plt.title(title)
draw_corrplot(corr)
### beta diversity
# Bray-Curtis sample-by-sample distance matrix (skbio DistanceMatrix),
# keyed by SampleID.
bc = beta_diversity('braycurtis', tax.drop(['SampleID'],axis=1).values, tax['SampleID'])
# Alternative metrics, left disabled (the UniFrac variants need the OTU tree):
#uw = beta_diversity('unweighted_unifrac', tax.drop(['SampleID'],axis=1).values, tax['SampleID'], otu_ids=otu_ids, tree=tree)
#wf = beta_diversity('weighted_unifrac', tax.drop(['SampleID'],axis=1).values, tax['SampleID'], otu_ids=otu_ids, tree=tree)
#jc = beta_diversity('jaccard', tax.drop(['SampleID'],axis=1).values, tax['SampleID'])
def generate_rowcolors(meta, column="HMPBODYSUBSITE"):
    """Map each distinct value of meta[column] to a colour (for clustermap row annotations).

    Fixes a misplaced parenthesis in the original:
    `len(set(meta[column]+1))` added 1 to every element (which would
    crash on non-numeric columns) instead of sizing the palette; the
    palette should simply have one colour per distinct value.
    """
    levels = set(meta[column])
    lut = dict(zip(levels, sns.hls_palette(len(levels), l=0.5, s=0.8)))
    row_colors = meta[column].map(lut)
    return row_colors
# One colour strip per annotation: body subsite, gender, obesity,
# chronic condition.
row_colors = pd.concat([
    generate_rowcolors(meta_alpha),
    generate_rowcolors(meta_alpha, column="GENDER"),
    generate_rowcolors(meta_alpha, column="OBESITY"),
    generate_rowcolors(meta_alpha, column="CHRONIC_CONDITION"),
],axis=1)
# Cluster only the first N samples to keep the figure tractable.
N = 1000
sns.set(font_scale=5)
sns.clustermap(pd.DataFrame(bc.data[:N,:N]), cmap="YlGnBu_r", row_colors=row_colors[:N], yticklabels=False, xticklabels=False, figsize=(80, 75))
from skbio.stats.ordination import pcoa
# Principal coordinates of the Bray-Curtis matrix; keep the first 3 axes.
PCoA = pcoa(bc.data).samples[['PC1','PC2','PC3']]
meta_alpha = meta_alpha[meta_alpha['SampleID'].isin(tax['SampleID'])]
# NOTE(review): reindex() with no arguments is a no-op — reset_index(drop=True)
# was probably intended; harmless here because PCoA.index is overwritten below.
meta_alpha = meta_alpha.reindex()
# Align PCoA rows with meta_alpha positionally before concatenating.
# Assumes both frames list the same samples in the same order — TODO confirm.
PCoA.index = meta_alpha.index
meta_pcoa = pd.concat([meta_alpha, PCoA],axis=1)
import plotly.express as px
import colorlover as cl
# Interpolate the 11-colour qualitative 'Paired' palette up to the
# 18 body subsites.
palette = cl.interp(cl.scales['11']['qual']['Paired'], 18)
COLORMAP = dict(zip([f'{i}' for i in range(18)], palette))
# Cast to str so plotly treats the subsite as a discrete category.
meta_pcoa['HMPBODYSUBSITE'] = meta_pcoa['HMPBODYSUBSITE'].astype(str)
# 3-D PCoA scatter: colour = body subsite, marker = gender, size = simpson index.
fig = px.scatter_3d(meta_pcoa, x="PC1", y="PC2", z="PC3", color="HMPBODYSUBSITE", symbol="GENDER", size="simpson",opacity=0.7,
                   color_discrete_map = COLORMAP)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
# Final modelling table: metadata + alpha diversity + OTU counts.
df = pd.merge(meta_alpha, tax, on="SampleID")
tax = tax[tax['SampleID'].isin(df['SampleID'])]
df.shape
# chao1_ci/esty_ci are (low, high) tuples and osd is a 3-tuple; expand
# each into scalar columns so sklearn/lightgbm can consume them.
for col in ['chao1_ci','esty_ci','osd']:
    df[f'{col}_0'] = df[col].apply(lambda t: t[0])
    df[f'{col}_1'] = df[col].apply(lambda t: t[1])
    if col =='osd':
        df[f'{col}_2'] = df[col].apply(lambda t: t[2])
df = df.drop(['chao1_ci','esty_ci','osd'],axis=1)
def KFold_model(df, MODEL, param, n_splits=5):
    """Stratified K-fold training of a sklearn-style classifier.

    Fixes the original, which immediately overwrote its MODEL, param and
    n_splits arguments with hard-coded values, making the parameters dead.

    Parameters
    ----------
    df : pd.DataFrame with a 'HMPBODYSUBSITE' integer target column.
    MODEL : classifier class (e.g. RandomForestClassifier).
    param : dict of keyword arguments passed to MODEL.
    n_splits : number of stratified folds.

    Returns
    -------
    (oof, model) : out-of-fold predictions for every row, and the model
    fitted on the LAST fold only.
    """
    # Exclude identifiers, the target, and alpha indices that are
    # near-duplicates of other diversity columns.
    drop_cols = ['SampleID', 'HMPBODYSUBSITE', 'heip_e', 'lladser_pe',
                 'margalef', 'mcintosh_d', 'pielou_e']
    X = df.drop(drop_cols, axis=1).values
    y = df['HMPBODYSUBSITE'].values
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros(len(df))
    model = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df, df['HMPBODYSUBSITE'])):
        model = MODEL(**param)
        model.fit(X[trn_idx], y[trn_idx])
        preds = model.predict(X[val_idx])
        oof[val_idx] = preds
        F1 = f1_score(preds, y[val_idx], average='micro')
        print('fold {} f1 score: {}'.format(fold_, F1))
    F1 = f1_score(oof, df['HMPBODYSUBSITE'], average='micro')
    print('OOF f1 score: {}'.format(F1))
    return oof, model
# Random-forest baseline on the combined feature table.
param = dict(max_depth=128, max_features='sqrt', n_estimators=400, bootstrap=False, random_state=42, n_jobs=-1)
oof, model = KFold_model(df, RandomForestClassifier, param)
def KFold_lgb(df, param, n_splits=5, CATEGORICAL=["RUN_CENTER", "GENDER","OBESITY","CHRONIC_CONDITION","HOST_SUBJECT_ID"]):
    """Stratified K-fold LightGBM multiclass training.

    Parameters
    ----------
    df : pd.DataFrame with 'SampleID' and integer target 'HMPBODYSUBSITE'.
    param : dict of LightGBM training parameters.
    n_splits : number of stratified folds.
    CATEGORICAL : columns LightGBM should treat as categorical.

    Returns
    -------
    oof : out-of-fold class predictions (argmax of predicted probabilities).
    feature_importance_mean : per-feature importance averaged over ALL folds
        (the original hard-coded folds 1..5 with a 0.2 weight, which broke
        for any n_splits != 5).
    clf, val_idx : booster and validation indices of the LAST fold.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros(len(df))
    feature_importance_df = pd.DataFrame()
    # Hoist the feature frame / target out of the loop — identical per fold.
    features = df.drop(['SampleID', 'HMPBODYSUBSITE'], axis=1)
    target = df['HMPBODYSUBSITE']
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df, target)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(features.iloc[trn_idx], label=target.iloc[trn_idx],
                               categorical_feature=CATEGORICAL
                               )
        val_data = lgb.Dataset(features.iloc[val_idx], label=target.iloc[val_idx],
                               categorical_feature=CATEGORICAL
                               )
        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds=100)
        preds = clf.predict(features.iloc[val_idx], num_iteration=clf.best_iteration)
        oof[val_idx] = np.argmax(preds, axis=1)
        fold_importance_df = pd.DataFrame({
            "Feature": list(features.columns),
            "importance": clf.feature_importance(),
            "fold": fold_ + 1,
        })
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        # micro-F1 is symmetric in (y_true, y_pred) for multiclass, so the
        # original argument order gave the same number.
        F1 = f1_score(oof[val_idx], target.iloc[val_idx], average='micro')
        print('fold {} f1 score: {}'.format(fold_, F1))
    F1 = f1_score(oof, target, average='micro')
    print('OOF f1 score: {}'.format(F1))
    # Mean importance across however many folds were run.
    feature_importance_mean = feature_importance_df.groupby('Feature')[['importance']].mean()
    return oof, feature_importance_mean, clf, val_idx
# LightGBM multiclass configuration: 18 body subsites, slow learning
# rate (relies on early stopping), feature/bagging subsampling and
# L1/L2 regularisation to limit overfitting on the wide OTU table.
param = {'num_leaves': 63,
         'min_data_in_leaf': 30,
         'objective': 'multiclass',
         "metric": 'multi_logloss',
         "num_class": 18,
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.7,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 12,
         "reg_lambda": 1,
         'reg_alpha': 1,
         "verbosity": -1,
         "nthread": -1,
         "random_state": 1
         }
oof, feature_importance_mean, clf, val_idx = KFold_lgb(df, param)
The model is most often confused between these class pairs:
left & right antecubital fossa
left & right retroauricular crease
subgingival plaque & supragingival plaque
throat & palatine tonsil
# Confusion matrix heatmap: rows = true labels, columns = predictions.
plt.figure(figsize=(24,24))
sns.set(font_scale=2)
# Fix: the original called confusion_matrix(oof, y_true), i.e. passed
# the predictions as y_true, which transposed the matrix relative to
# the 'True Label' / 'Predicted Label' axis titles below.
# Dict here is the HMPBODYSUBSITE label-encoding mapping (last loop
# iteration of the encoding cell), so its keys are the class names.
ax = sns.heatmap(confusion_matrix(df['HMPBODYSUBSITE'],
                                  pd.Series(oof)),
                 cmap="YlGnBu", annot=True, cbar=False, annot_kws={"size":18},
                 linewidths=0.5,fmt="d",
                 xticklabels=Dict.keys(), yticklabels=Dict.keys()
                 )
ax.set_ylim(18, -0.01)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
# Top-50 features by mean LightGBM importance across folds.
plt.figure(figsize=(15,45))
sns.barplot(x="importance",
            y="Feature",
            data=feature_importance_mean.sort_values(by="importance", ascending=False)[:50].reset_index())
Which feature has the greatest impact on the model output?
import shap
# SHAP values on the last fold's validation set; for multiclass
# LightGBM, shap_values is a list with one array per class.
valid_x = df.drop(['SampleID','HMPBODYSUBSITE'],axis=1).iloc[val_idx]
shap_values = shap.TreeExplainer(clf).shap_values(valid_x)
shap.summary_plot(shap_values, valid_x, plot_size=(10,20))
# Visualize what feature contributes most to a single class
# (Dict's key order matches the encoded class ids from the encoding cell).
print(list(Dict.keys())[0])
shap.summary_plot(shap_values[0], valid_x, plot_size=(10,20))
print(list(Dict.keys())[1])
shap.summary_plot(shap_values[1], valid_x, plot_size=(10,20))
@interact
def display_shap_summary(site=list(Dict.keys())):
    """Interactive SHAP summary plot for the selected body subsite.

    Not able to render properly when downloaded as HTML.
    """
    # Dict preserves insertion order, so the positional index of a key
    # is its encoded class id; list.index replaces the original
    # np.where(...).tolist()[0] round-trip through NumPy.
    idx = list(Dict.keys()).index(site)
    shap.summary_plot(shap_values[idx], valid_x, plot_size=(10,20))